rm(list = ls())
# Use pacman to check whether packages are installed, if not load
if (!require("pacman")) install.packages("pacman")
library(pacman)
# Install packages
pacman::p_load(
here,
rio,
dplyr,
kableExtra,
patchwork,
DescTools,
tidyverse,
janitor,
survival,
naniar,
ggmice,
dplyr,
knitr
)External validation, recalibration, and clinical utility of the prognostic model kidney failure risk equation in patients with CKD stages G3-4: a nationwide retrospective cohort analysis in Peru
Main Analysis - Winsorization 1.5%: 1. Initial data analysis
1 Setup
2 Cargar datos
Los datos completos se muestran a continuación, luego de seleccionar a las variables con las que trabajaremos:
# Import data
data <- readRDS(here::here("Data", "Tidy", "integrated_total_data.rds")) |>
select(cas, sex, age, hta, dm, crea,
ckd_stage, ckd_stage2,
eGFR_ckdepi, acr, urine_album,
urine_crea, time5y, eventd5y,
grf_cat, acr_cat, ckd_class,
death2y, eventd2ylab, death5y,
eventd5ylab, eventd, eventdf, time)
data |>
glimpse()Rows: 152,084
Columns: 24
$ cas <chr> "Arequipa", "Arequipa", "Arequipa", "Arequipa", "Arequipa"…
$ sex <fct> Masculino, Masculino, Masculino, Masculino, Masculino, Fem…
$ age <dbl> 15, 15, 15, 15, 15, 18, 15, 16, NA, NA, NA, 22, 21, NA, 22…
$ hta <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, NA, 0, 1, 0, 1, …
$ dm <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, NA, 0, 0, NA, 0, 0, 0, 1,…
$ crea <dbl> 0.99, 1.20, 0.97, 0.91, 0.81, 0.76, 0.88, 1.20, 0.75, 0.92…
$ ckd_stage <fct> Stages 1-2 y 5, Stages 1-2 y 5, Stages 1-2 y 5, Stages 1-2…
$ ckd_stage2 <fct> Stages 1-3 y 5, Stages 1-3 y 5, Stages 1-3 y 5, Stages 1-3…
$ eGFR_ckdepi <dbl> 113.08736, 89.62041, 115.91243, 125.21489, 132.51473, 114.…
$ acr <dbl> NA, NA, NA, NA, NA, NA, 0.2522603, NA, NA, NA, NA, NA, 392…
$ urine_album <dbl> NA, NA, NA, NA, NA, NA, 22.60, NA, NA, NA, NA, NA, 196.94,…
$ urine_crea <dbl> NA, NA, NA, NA, NA, NA, 89.5900, NA, 278.1400, 392.2000, 1…
$ time5y <dbl> 5.0000000, 5.0000000, 5.0000000, 5.0000000, 5.0000000, 4.0…
$ eventd5y <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ grf_cat <fct> G1, G2, G1, G1, G1, G1, G1, G2, NA, NA, NA, G3b, G1, NA, G…
$ acr_cat <fct> NA, NA, NA, NA, NA, NA, A1, NA, NA, NA, NA, NA, A3, NA, A3…
$ ckd_class <fct> NA, NA, NA, NA, NA, NA, Low risk, NA, NA, NA, NA, NA, High…
$ death2y <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ eventd2ylab <chr> "Alive w/o Kidney Failure", "Alive w/o Kidney Failure", "A…
$ death5y <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ eventd5ylab <chr> "Alive w/o Kidney Failure", "Alive w/o Kidney Failure", "A…
$ eventd <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ eventdf <chr> "Alive without Kidney Failure", "Alive without Kidney Fail…
$ time <dbl> 7.4934976, 7.6933607, 7.4934976, 7.9041752, 7.9041752, 4.0…
3 Initial Data Analysis
We will begin by identifying implausible time data. It is evident that all implausible time data belong to individuals who experienced kidney failure, and this implausibility is due to negative event times. Regarding missing data, we observe that time data is missing in both individuals who did not report kidney failure and those who did. As expected, there were no missing time data for individuals who passed away, since these records come from the insured’s office, which cross-references data with RENIEC. Additionally, a significant group of individuals had missing time and event data (it is unknown whether they developed kidney failure or passed away).
data |>
bind_rows(data |>
mutate(eventdf = as.character("Total"))) |>
mutate(eventdf = factor(eventdf,
levels = c("Alive without Kidney Failure",
"Kidney Failure",
"Dead without Kidney Failure",
"Total",
"Missing Data"))) |>
count(time, eventdf) |>
mutate(time = case_when(is.na(time) ~ 15,
TRUE ~ time),
time_tipo = case_when(time > 0 & time < 15 ~ "Plausible Value",
time <= 0 ~ "Implausible Value",
time == 15 ~ "Missing Data",
TRUE ~ as.character(NA))) |>
ggplot(aes(x = time, y = n, color = time_tipo)) +
geom_point(shape = 21, alpha = 0.5) +
geom_vline(xintercept = 0.1, linetype = 2, color = "red") +
scale_y_continuous(trans = "log10") +
labs(color = "Data type") +
facet_wrap(. ~ eventdf) +
theme_bw() -> p_tiempo_perdido
ggsave(filename = "p_tiempo_perdido.jpeg",
plot = p_tiempo_perdido,
device = "jpeg",
path = here("Figures", "Sensitivity-woWinsorize", "Imputation_Diagnostic"),
scale = 2.5,
width = 9,
height = 9,
units = "cm",
dpi = 1000,
bg = "white") Regarding the distribution of this missing data, we have the following:
data |>
mutate(time_tipo = case_when(time > 0 & time < 15 ~ "Valor Plausible",
time <= 0 | time >= 15 ~ "Valor Implausible",
is.na(time) ~ "Dato perdido",
TRUE ~ as.character(NA))) |>
tabyl(time_tipo) |>
adorn_pct_formatting() -> tabla_tiempo_perdidos
tabla_tiempo_perdidos |>
kbl() |>
kable_styling()| time_tipo | n | percent |
|---|---|---|
| Dato perdido | 18584 | 12.2% |
| Valor Implausible | 534 | 0.4% |
| Valor Plausible | 132966 | 87.4% |
The percentage of missing time data is 12.2% (n = 1.8584^{4}). Additionally, the number of implausible time values is minimal, representing 0.4% (n = 534).
4 Selection of individuals for analysis
- The number of individuals under 18 years old is as follows:
data_eleg <- data |>
filter(age >= 18, ckd_stage == "Stages 3-4")
nrow(data_eleg )[1] 30488
data_noeleg_age_menos18 <- data |>
filter(age < 18)
nrow(data_noeleg_age_menos18)[1] 145
- The number of individuals with missing age data is as follows:
data_noeleg_age_na <- data |>
filter(is.na(age))
nrow(data_noeleg_age_na)[1] 18610
- The number of individuals with a CKD diagnosis other than stage 3a-4 is as follows:
data_noeleg_ckd12 <- data |>
filter(ckd_stage == "Stages 1-2 y 5", eGFR_ckdepi >= 60)
nrow(data_noeleg_ckd12)[1] 98185
data_noeleg_ckd5 <- data |>
filter(ckd_stage == "Stages 1-2 y 5", eGFR_ckdepi < 15)
nrow(data_noeleg_ckd5)[1] 773
- The number of individuals with missing CKD diagnosis data is as follows:
data_noeleg_ckd_na <- data |>
filter(is.na(ckd_stage))
nrow(data_noeleg_ckd_na)[1] 22627
- The number of individuals with ineligible age or CKD stages is as follows:
data_noeleg_ckd_age <- data |>
filter(ckd_stage == "Stages 1-2 y 5" | age < 18)
nrow(data_noeleg_ckd_age)[1] 98969
- The number of individuals with missing age data or missing CKD stages data is as follows:
data_noeleg_ckd_age_na <- data |>
filter(is.na(ckd_stage) | is.na(age))
nrow(data_noeleg_ckd_age_na)[1] 22627
- The number of potentially eligible individuals, either due to being within the age or CKD stages range or having missing data, is as follows:
data_eleg_potent <- data |>
filter((age >= 18 | is.na(age)) & (ckd_stage == "Stages 3-4" | is.na(ckd_stage)))
nrow(data_eleg_potent)[1] 53115
- The number of individuals included in stages 3a-4 is as follows:
data_includ <- data_eleg |>
filter(time > 0, !is.na(eventd))
nrow(data_includ)[1] 30031
- The number of individuals included in stages 3b-4 is as follows:
data_includ2 <- data_includ |>
filter(ckd_stage2 == "Stages 3b-4")
nrow(data_includ2)[1] 11540
- The data excluded due to implausible or missing time values are as follows:
datos_exclud_time_implau <- data_eleg |>
filter(time <= 0)
nrow(datos_exclud_time_implau)[1] 366
- The data excluded due to missing time data are as follows:
datos_exclud_time_na <- data_eleg |>
filter(is.na(time))
nrow(datos_exclud_time_na)[1] 91
- The data excluded due to missing outcome status are as follows:
datos_exclud_eventd_na <- data_eleg |>
filter(is.na(eventd))
nrow(datos_exclud_eventd_na)[1] 91
- The data excluded due to missing outcome status/time or implausible time are as follows:
datos_exclud_time_eventd <- data_eleg |>
filter(is.na(eventd) | is.na(time) | time <= 0)
nrow(datos_exclud_time_eventd)[1] 457
5 Flowchart of Participant Selection/Inclusion
# Create grid of 100 x 100----
data_flow <- tibble(x = 1:100, y = 1:100)
data_flow %>%
ggplot(aes(x, y)) +
scale_x_continuous(minor_breaks = seq(1, 100, 1)) +
scale_y_continuous(minor_breaks = seq(1, 100, 1)) +
theme_linedraw() -> p
# Create boxes----
box_xmin <- 33 - 20
box_xmax <- 75 - 20
box_ymin <- 94
box_ymax <- 99
box_size <- 0.25
text_param <- function(box_min, box_max) {
mean(c(box_min, box_max))
}
text_size <- 2
p +
# Column 0----
## Level 1----
geom_rect(xmin = box_xmin, xmax = box_xmax,
ymin = box_ymin - 1, ymax = box_ymax + 1,
color = "black", fill = "white",
size = box_size) +
annotate('text',
x = text_param(box_xmin, box_xmax),
y = text_param(box_ymin - 1, box_ymax + 1),
label = str_glue('Total patients in VISARE database\n(n = {nrow(data)})'),
size = text_size) +
## Level 1.5----
geom_rect(xmin = box_xmin, xmax = box_xmax,
ymin = box_ymin - 27 - 2, ymax = box_ymax - 27 + 2,
color = "black", fill = "white",
size = box_size) +
annotate('text',
x = text_param(box_xmin, box_xmax),
y = text_param(box_ymin - 27, box_ymax - 27),
label = str_glue('Total potentially eligible patients\nwith CKD G3a-G4 and ≥ 18 years\n(n = {nrow(data_eleg_potent)})'),
size = text_size) +
## Level 2----
geom_rect(xmin = box_xmin, xmax = box_xmax,
ymin = box_ymin - 50 - 1, ymax = box_ymax - 50 + 1,
color = "black", fill = "white",
size = box_size) +
annotate('text',
x = text_param(box_xmin, box_xmax),
y = text_param(box_ymin - 50 - 1, box_ymax - 50 + 1),
label = str_glue('Total patients included in the study\n(n = {nrow(data_eleg)})'),
size = text_size) +
# Column -1----
geom_rect(xmin = box_xmin, xmax = box_xmax,
ymin = box_ymin - 81, ymax = box_ymax - 79,
color = "black", fill = "white",
size = box_size) +
annotate('text',
x = text_param(box_xmin, box_xmax),
y = text_param(box_ymin - 81, box_ymax - 79),
label = str_glue('Patients with CKD G3a-G4\n(n = {nrow(data_includ)})'),
size = text_size) +
# Column +1----
## Level 1----
geom_rect(xmin = box_xmin + 23, xmax = box_xmax + 47,
ymin = box_ymin - 19 + 2.5, ymax = box_ymax - 12 + 2.5,
color = "black", fill = "white",
size = box_size) +
annotate('text',
x = text_param(box_xmin + 23, box_xmax + 47),
y = text_param(box_ymin - 19 + 2.5, box_ymax - 12 + 2.5),
label = str_glue(paste0('Not eligible due to age < 18 or CKD at other stages (n = {nrow(data_noeleg_ckd_age)})\n',
'{nrow(data_noeleg_age_menos18)} were < 18 years old\n',
'{nrow(data_noeleg_ckd12)} had normal or slightly reduced eGFR (CKD G1 or G2)\n',
'{nrow(data_noeleg_ckd5)} had renal failure (Stage G5)\n')),
size = text_size) +
## Level 1.5----
geom_rect(xmin = box_xmin + 23, xmax = box_xmax + 47,
ymin = box_ymin - 19 - 25 + 1.5, ymax = box_ymax - 12 - 25 + 1.5,
color = "black", fill = "white",
size = box_size) +
annotate('text',
x = text_param(box_xmin + 23, box_xmax + 47),
y = text_param(box_ymin - 19 - 25 + 1.5, box_ymax - 12 - 25 + 1.5),
label = str_glue(paste0('Excluded due to missing data in selection criteria (n = {nrow(data_noeleg_ckd_age_na)} [{round(100 * nrow(data_noeleg_ckd_age_na)/nrow(data_eleg_potent), 1)}%])\n',
'{nrow(data_noeleg_age_na)} ({round(100 * nrow(data_noeleg_age_na)/nrow(data_eleg_potent), 1)}%) without evaluation date in VISARE\n',
'{nrow(data_noeleg_ckd_na)} ({round(100 * nrow(data_noeleg_ckd_na)/nrow(data_eleg_potent), 1)}%) without eGFR data to define CKD stages\n')),
size = text_size) +
## Level 2----
geom_rect(xmin = box_xmin + 23, xmax = box_xmax + 47,
ymin = box_ymin - 19 - 45 - 2, ymax = box_ymax - 12 - 45 - 2,
color = "black", fill = "white",
size = box_size) +
annotate('text',
x = text_param(box_xmin + 23, box_xmax + 47),
y = text_param(box_ymin - 19 - 45 - 2, box_ymax - 12 - 45 - 2),
label = str_glue(paste0('Excluded due to missing/implausible outcome data (n = {nrow(datos_exclud_time_eventd)} [{round(100 * nrow(datos_exclud_time_eventd)/nrow(data_eleg), 1)}%])\n',
'{nrow(datos_exclud_eventd_na)} ({round(100 * nrow(datos_exclud_eventd_na)/nrow(data_eleg), 1)}%) without time-to-event outcome data\n',
'{nrow(datos_exclud_time_implau)} ({round(100 * nrow(datos_exclud_time_implau)/nrow(data_eleg), 1)}%) with implausible (negative) or irrelevant (zero) event times')),
size = text_size) +
# vertical arrow
geom_segment(x = text_param(box_xmin, box_xmax), xend = text_param(box_xmin, box_xmax),
y = 93, yend = 74,
size = 0.15,
linejoin = "mitre",
lineend = "butt",
arrow = arrow(length = unit(1, "mm"), type = "closed")) +
# vertical arrow
geom_segment(x = text_param(box_xmin, box_xmax), xend = text_param(box_xmin, box_xmax),
y = 65, yend = 50,
size = 0.15,
linejoin = "mitre",
lineend = "butt",
arrow = arrow(length = unit(1, "mm"), type = "closed")) +
geom_segment(x = text_param(box_xmin, box_xmax), xend = text_param(box_xmin, box_xmax),
y = 43, yend = 25,
size = 0.15,
linejoin = "mitre",
lineend = "butt") +
# horizontal arrow 1-->
geom_segment(x = text_param(box_xmin, box_xmax), xend = box_xmin + 23,
y = text_param(box_ymin - 19 + 2, box_ymax - 12 + 2), yend = text_param(box_ymin - 19 + 2, box_ymax - 12 + 2),
size = 0.15,
linejoin = "mitre",
lineend = "butt",
arrow = arrow(length = unit(1, "mm"), type = "closed")) +
# horizontal arrow 2-->
geom_segment(x = text_param(box_xmin, box_xmax), xend = box_xmin + 23,
y = text_param(box_ymin - 19 - 45, box_ymax - 12 - 45), yend = text_param(box_ymin - 19 - 45, box_ymax - 12 - 45),
size = 0.15,
linejoin = "mitre",
lineend = "butt",
arrow = arrow(length = unit(1, "mm"), type = "closed")) +
# horizontal arrow 2-->
geom_segment(x = text_param(box_xmin, box_xmax), xend = box_xmin + 23,
y = text_param(box_ymin - 19 - 25 + 1, box_ymax - 12 - 25 + 1), yend = text_param(box_ymin - 19 - 25 + 1, box_ymax - 12 - 25 + 1),
size = 0.15,
linejoin = "mitre",
lineend = "butt",
arrow = arrow(length = unit(1, "mm"), type = "closed")) +
# vertical arrow -->
geom_segment(x = text_param(box_xmin, box_xmax), xend = text_param(box_xmin, box_xmax),
y = 25, yend = box_ymax - 79,
size = 0.15,
linejoin = "mitre",
lineend = "butt",
arrow = arrow(length = unit(1, "mm"), type = "closed")) +
theme_void() +
theme(plot.background = element_rect(fill = "white")) -> plot_flowchart
ggsave(filename = "plot_flowchart_english.jpeg",
plot = plot_flowchart,
device = "jpeg",
path = here("Figures", "Sensitivity-woWinsorize"),
scale = 1,
width = 12,
height = 12,
units = "cm",
dpi = 1000)knitr::include_graphics(here("Figures", "Sensitivity-woWinsorize", "plot_flowchart_english.jpeg"))6 Data Distribution by Region
data |>
mutate(inclusion = case_when(age >= 18 & time > 0 & ckd_stage == "Stages 3-4" ~ "Incluido",
TRUE ~ "No incluido"),
cas = fct_rev(fct_infreq(cas))) |>
ggplot(aes(x = cas, fill = factor(inclusion, level = c("Incluido", "No incluido")))) +
geom_bar() +
labs(fill = "Inclusión", x = NULL, y = "Frecuencia absoluta") +
coord_flip(expand = FALSE) +
scale_y_continuous(limits = c(0, 30000)) +
theme_bw() -> p1
data |>
mutate(inclusion = case_when(age >= 18 & time > 0 & ckd_stage == "Stages 3-4" ~ "Incluido",
TRUE ~ "No incluido"),
cas = fct_rev(fct_infreq(cas))) |>
ggplot(aes(x = cas, fill = factor(inclusion, level = c("Incluido", "No incluido")))) +
geom_bar(position = "fill") +
labs(fill = "Inclusión", x = NULL, y = "Proporción") +
coord_flip(expand = FALSE) +
scale_y_continuous(limits = c(0, 1)) +
theme_bw() +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank()) -> p2
data |>
mutate(inclusion = case_when(age >= 18 & time > 0 & ckd_stage == "Stages 3-4" ~ "Incluido",
TRUE ~ "No incluido"),
cas = fct_rev(fct_infreq(cas))) |>
filter(inclusion == "Incluido") |>
ggplot(aes(x = cas)) +
geom_bar(fill = "#F8766D") +
labs(x = NULL, y = "Frecuencia absoluta") +
coord_flip(expand = FALSE) +
scale_y_continuous(limits = c(0, 11000)) +
theme_bw() +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank()) -> p3
(p1 | p3 | p2) +
plot_layout(guides = "collect") -> plot_region
ggsave(filename = "plot_region.jpeg",
plot = plot_region,
device = "jpeg",
path = here("Figures", "Sensitivity-woWinsorize"),
scale = 2,
width = 12,
height = 9,
units = "cm",
dpi = 1000,
bg = "white") 7 Selection of Patients Included in the Study
data %>%
# Criterio de seleccion
filter(age >= 18, time > 0) |>
filter(ckd_stage == "Stages 3-4") -> dataAdataA |>
# Preparacion de datos
select(cas, sex, age, hta, dm, crea, ckd_stage2,
eGFR_ckdepi, acr, urine_album,
urine_crea, time5y, eventd5y,
grf_cat, acr_cat, ckd_class,
death2y, eventd2ylab, death5y,
eventd5ylab, eventd, time) |>
mutate(hta = factor(hta),
dm = factor(dm),
cas2 = case_when(cas %in% c("Lima - Rebagliati", "Lima - Almenara", "Lima - Sabogal") ~ "Lima",
TRUE ~ "Otras Redes"),
cas = fct_rev(fct_infreq(cas)),
cas2 = fct_rev(fct_infreq(cas2))
) |>
# rowwise() |>
mutate(
acr = Winsorize(acr, val = quantile(dataA$acr,
probs = c(0.015, 0.985),
na.rm = TRUE)),
urine_album = Winsorize(urine_album , val = quantile(dataA$urine_album,
probs = c(0.015, 0.985),
na.rm = TRUE)),
urine_crea = Winsorize(urine_crea, val = quantile(dataA$urine_crea,
probs = c(0.015, 0.985),
na.rm = TRUE))) |>
# ungroup() |>
mutate( eventd2ylab = factor(eventd2ylab,
levels = c("Alive w/o Kidney Failure",
"Kidney Failure",
"Death w/o Kidney Failure")),
eventd5ylab = factor(eventd5ylab,
levels = c("Alive w/o Kidney Failure",
"Kidney Failure",
"Death w/o Kidney Failure"))) -> dataA8 Load User-Written Functions
source(here("Code", "source", "kfre_pi.R"))
source(here("Code", "source", "kfre_pr.R"))
source(here("Code", "source", "oe_ratio.R"))
source(here("Code", "source", "calibration_intercept.R"))
source(here("Code", "source", "calibration_slope.R"))
source(here("Code", "source", "auc.R"))
source(here("Code", "source", "validate.mids.R"))
source(here("Code", "source", "pool.validate.mids.R"))
source(here("Code", "source", "pool.auc.mids.R"))
source(here("Code", "source", "predict.mira.R"))
source(here("Code", "source", "process_imp_cal_plot.R"))9 Creation of Cause-Specific Cumulative Hazard Using the Nelson-Aalen Estimator
cumhaz1 <- basehaz(coxph(Surv(time, eventd == 1) ~ 1, data = dataA)) |>
rename(cumhaz1 = hazard)
cumhaz2 <- basehaz(coxph(Surv(time, eventd == 2) ~ 1, data = dataA))|>
rename(cumhaz2 = hazard)
dataA <- dataA |>
left_join(cumhaz1, by = "time") |>
left_join(cumhaz2, by = "time")10 Data Preparation
dataA_imp <- dataA |>
mutate(eventdf = factor(eventd, levels = c(0, 1, 2))) |>
select(-ckd_class, -acr_cat) |>
mutate(
sex_cumhaz1 = as.integer(sex == "Masculino") * (cumhaz1 - mean(cumhaz1, na.rm = TRUE)),
age_cumhaz1 = (age - mean(age, na.rm = TRUE)) * (cumhaz1 - mean(cumhaz1, na.rm = TRUE)),
eGFR_ckdepi_cumhaz1 = (eGFR_ckdepi - mean(eGFR_ckdepi, na.rm = TRUE)) * (cumhaz1 - mean(cumhaz1, na.rm = TRUE)),
sex_cumhaz2 = as.integer(sex == "Masculino") * (cumhaz2 - mean(cumhaz2, na.rm = TRUE)),
age_cumhaz2 = (age - mean(age, na.rm = TRUE)) * (cumhaz2 - mean(cumhaz2, na.rm = TRUE)),
eGFR_ckdepi_cumhaz2 = (eGFR_ckdepi - mean(eGFR_ckdepi, na.rm = TRUE)) * (cumhaz2 - mean(cumhaz2, na.rm = TRUE)),
log_urine_crea = log(urine_crea),
log_urine_album = if_else(is.infinite(log(urine_album)), NA, log(urine_album)),
log_acr = log(acr),
log_acr_cumhaz1 = (log_acr - mean(log_acr, na.rm = TRUE)) * (cumhaz1 - mean(cumhaz1, na.rm = TRUE)),
log_urine_crea_cumhaz1 = (log_urine_crea - mean(log_urine_crea, na.rm = TRUE)) * (cumhaz1 - mean(cumhaz1, na.rm = TRUE)),
log_urine_album_cumhaz1 = (log_urine_album - mean(log_urine_album, na.rm = TRUE)) * (cumhaz1 - mean(cumhaz1, na.rm = TRUE)),
log_acr_cumhaz2 = (log_acr - mean(log_acr, na.rm = TRUE)) * (cumhaz2 - mean(cumhaz2, na.rm = TRUE)),
log_urine_crea_cumhaz2 = (log_urine_crea - mean(log_urine_crea, na.rm = TRUE)) * (cumhaz2 - mean(cumhaz2, na.rm = TRUE)),
log_urine_album_cumhaz2 = (log_urine_album - mean(log_urine_album, na.rm = TRUE)) * (cumhaz2 - mean(cumhaz2, na.rm = TRUE)),
hta_cumhaz1 = as.integer(hta == 1) * (cumhaz1 - mean(cumhaz1, na.rm = TRUE)),
hta_cumhaz2 = as.integer(hta == 1) * (cumhaz2 - mean(cumhaz2, na.rm = TRUE)),
dm_cumhaz1 = as.integer(dm == 1) * (cumhaz1 - mean(cumhaz1, na.rm = TRUE)),
dm_cumhaz2 = as.integer(dm == 1) * (cumhaz2 - mean(cumhaz2, na.rm = TRUE)),
cas2_cumhaz1 = as.integer(cas2 == "Lima") * (cumhaz1 - mean(cumhaz1, na.rm = TRUE)),
cas2_cumhaz2 = as.integer(cas2 == "Lima") * (cumhaz2 - mean(cumhaz2, na.rm = TRUE))
) |>
select(-urine_crea, -urine_album, -acr) |>
relocate(all_of(c("hta", "dm", "log_urine_crea", "log_acr", "log_urine_album")),
.after = eGFR_ckdepi_cumhaz2)
saveRDS(dataA_imp, here("Data", "Tidy", "Sensitivity-woWinsorize", "dataA_imp.rds"))11 Exploration of Missing Data
- The variables to be considered are shown below:
dataA |>
glimpse()Rows: 30,031
Columns: 25
$ cas <fct> Lima - Rebagliati, Lima - Rebagliati, Lima - Rebagliati, L…
$ sex <fct> Femenino, Femenino, Masculino, Femenino, Femenino, Femenin…
$ age <dbl> 22, 22, 93, 50, 66, 79, 65, 20, 61, 78, 59, 82, 18, 78, 80…
$ hta <fct> 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ dm <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ crea <dbl> 2.10, 3.11, 1.58, 2.84, 1.69, 1.37, 1.34, 2.74, 1.12, 1.36…
$ ckd_stage2 <fct> Stages 3b-4, Stages 3b-4, Stages 3b-4, Stages 3b-4, Stages…
$ eGFR_ckdepi <dbl> 32.68936, 20.33397, 37.15369, 18.64175, 31.20480, 36.70976…
$ acr <dbl> NA, 1598.9600, 185.8228, 27817.4550, 10527.1500, 27.9000, …
$ urine_album <dbl> NA, 52.030, 7.340, 1365.173, 416.770, NA, 561.400, 12.360,…
$ urine_crea <dbl> NA, 32.50, 39.50, 46.06, 39.59, NA, 51.30, 42.00, 35.68, 6…
$ time5y <dbl> 2.9842574, 2.8035592, 0.5010267, 5.0000000, 5.0000000, 0.8…
$ eventd5y <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0…
$ grf_cat <fct> G3b, G4, G3b, G4, G3b, G3b, G3b, G4, G3a, G3a, G4, G3b, G3…
$ acr_cat <fct> NA, A3, A2, A3, A3, A1, A3, A2, A1, A3, A3, A3, A2, A2, A2…
$ ckd_class <fct> NA, Very high risk, Very high risk, Very high risk, Very h…
$ death2y <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ eventd2ylab <fct> Alive w/o Kidney Failure, Alive w/o Kidney Failure, Alive …
$ death5y <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0…
$ eventd5ylab <fct> Alive w/o Kidney Failure, Alive w/o Kidney Failure, Alive …
$ eventd <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0…
$ time <dbl> 2.9842574, 2.8035592, 0.5010267, 5.4154689, 6.5817933, 0.8…
$ cas2 <fct> Lima, Lima, Lima, Lima, Lima, Lima, Lima, Lima, Lima, Lima…
$ cumhaz1 <dbl> 0.039036846, 0.037272186, 0.008296835, 0.056070997, 0.0601…
$ cumhaz2 <dbl> 0.12066670, 0.11277426, 0.01513545, 0.25513897, 0.33189356…
- The amount of missing data by variable is as follows:
dataA |>
select(-c(ckd_stage2, time5y, eventd5y, acr_cat,
death2y, eventd2ylab, death5y, eventd5ylab, time, cas)) |>
miss_var_summary() |>
kbl()| variable | n_miss | pct_miss |
|---|---|---|
| urine_album | 21249 | 70.8 |
| acr | 20213 | 67.3 |
| ckd_class | 20213 | 67.3 |
| urine_crea | 16739 | 55.7 |
| dm | 8606 | 28.7 |
| hta | 5715 | 19.0 |
| sex | 0 | 0 |
| age | 0 | 0 |
| crea | 0 | 0 |
| eGFR_ckdepi | 0 | 0 |
| grf_cat | 0 | 0 |
| eventd | 0 | 0 |
| cas2 | 0 | 0 |
| cumhaz1 | 0 | 0 |
| cumhaz2 | 0 | 0 |
Below is a graph showing the pattern of missing data. We can observe a general missing data pattern, characterized by multivariate missing data. No monotone pattern of missing data is observed (as expected), and there is no evidence of file matching. Additionally, the missing data exhibits a connected pattern at both the row and column levels.
dataA |>
select(-c(ckd_stage2, time5y, eventd5y, acr_cat, ckd_class,
death2y, eventd2ylab, death5y, eventd5ylab, time, cas)) |>
plot_pattern(rotate = TRUE) -> plot_patterns
ggsave(filename = "plot_patterns.jpeg",
plot = plot_patterns,
device = "jpeg",
path = here("Figures", "Sensitivity-woWinsorize", "Imputation_Diagnostic"),
scale = 2,
width = 9,
height = 9,
units = "cm",
dpi = 1000,
bg = "white") 12 Influx - outflux
dataA |>
select(-c(ckd_stage2, time5y, eventd5y, acr_cat, ckd_class,
death2y, eventd2ylab, death5y, eventd5ylab, cas)) |>
plot_flux(label = FALSE) -> plot_influx
# Primero, averiguamos cuántas capas hay.
num_layers <- length(plot_influx$layers)
# Examinamos cada capa para encontrar la geom_point() que no deseamos.
# Esto imprimirá las capas y deberías buscar la que contiene la geom_point sin shape.
for (i in 1:num_layers) {
print(plot_influx$layers[[i]])
}mapping: intercept = ~intercept, slope = ~slope
geom_abline: na.rm = FALSE
stat_identity: na.rm = FALSE
position_identity
geom_point: na.rm = FALSE
stat_identity: na.rm = FALSE
position_nudge
# Eliminamos la capa geom_point que no queremos.
# La salida muestra que es la segunda capa, así que la eliminamos.
plot_influx$layers <- plot_influx$layers[-2]
# Asegúrate de tener suficientes formas para cada nivel único de la variable.
unique_vrbs <- unique(plot_influx$data$vrb)
shapes <- seq_along(unique_vrbs)
# Ahora deberías volver a agregar la capa geom_point con las formas y colores adecuados.
plot_influx <- plot_influx +
geom_jitter(aes(shape = vrb, colour = vrb), width = 0.025, height = 0.025) +
scale_shape_manual(values = shapes) +
guides(colour = guide_legend(override.aes = list(shape = shapes)),
shape = FALSE)
ggsave(filename = "plot_influx.jpeg",
plot = plot_influx,
device = "jpeg",
path = here("Figures", "Sensitivity-woWinsorize", "Imputation_Diagnostic"),
scale = 2,
width = 9,
height = 9,
units = "cm",
dpi = 1000,
bg = "white") 13 By region
dataA |>
mutate(miss_acr = if_else(is.na(acr), "Missing", "Completo"),
miss_acr = factor(miss_acr, level = c("Completo", "Missing"))) |>
ggplot(aes(x = cas, fill = miss_acr)) +
geom_bar(position = "fill") +
labs(fill = "Datos perdidos", x = NULL, y = "Proporción") +
coord_flip(expand = FALSE) +
scale_y_continuous(limits = c(0, 1)) +
theme_bw() -> p1
dataA |>
mutate(miss_acr = if_else(is.na(acr), "Missing", "Completo"),
miss_acr = factor(miss_acr, level = c("Completo", "Missing"))) |>
ggplot(aes(x = cas, fill = miss_acr)) +
geom_bar() +
labs(fill = "Datos perdidos", x = NULL, y = "Frecuencia absoluta") +
coord_flip(expand = FALSE) +
scale_y_continuous(limits = c(0, 10000)) +
theme_bw() -> p2
(p1 | p2) + plot_layout(guides = "collect") -> plot_region_missing
ggsave(filename = "plot_region_missing.jpeg",
plot = plot_region_missing,
device = "jpeg",
path = here("Figures", "Sensitivity-woWinsorize", "Imputation_Diagnostic"),
scale = 2,
width = 16,
height = 8,
units = "cm",
dpi = 1000,
bg = "white") dataA |>
mutate(miss_acr = if_else(is.na(acr), "Missing", "Completo"),
miss_acr = factor(miss_acr, level = c("Completo", "Missing"))) |>
ggplot(aes(x = cas2, fill = miss_acr)) +
geom_bar(position = "fill") +
labs(fill = "Datos perdidos", x = NULL, y = "Proporción") +
coord_flip(expand = FALSE) +
scale_y_continuous(limits = c(0, 1)) +
theme_bw() -> p1
dataA |>
mutate(miss_acr = if_else(is.na(acr), "Missing", "Completo"),
miss_acr = factor(miss_acr, level = c("Completo", "Missing"))) |>
ggplot(aes(x = cas2, fill = miss_acr)) +
geom_bar() +
labs(fill = "Datos perdidos", x = NULL, y = "Frecuencia absoluta") +
coord_flip(expand = FALSE) +
scale_y_continuous(limits = c(0, 16000)) +
theme_bw() -> p2
(p1 | p2) + plot_layout(guides = "collect") -> plot_region_missing
ggsave(filename = "plot_region_missing2.jpeg",
plot = plot_region_missing,
device = "jpeg",
path = here("Figures", "Sensitivity-woWinsorize", "Imputation_Diagnostic"),
scale = 2,
width = 16,
height = 6,
units = "cm",
dpi = 1000,
bg = "white") 14 Ticket de Reprocubilidad
sessionInfo()R version 4.4.1 (2024-06-14)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
locale:
[1] LC_CTYPE=es_ES.UTF-8 LC_NUMERIC=C
[3] LC_TIME=es_ES.UTF-8 LC_COLLATE=es_ES.UTF-8
[5] LC_MONETARY=es_ES.UTF-8 LC_MESSAGES=es_ES.UTF-8
[7] LC_PAPER=es_ES.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
time zone: America/Lima
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] knitr_1.48 ggmice_0.1.0 naniar_1.1.0 survival_3.7-0
[5] janitor_2.2.0 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
[9] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
[13] ggplot2_3.5.1 tidyverse_2.0.0 DescTools_0.99.56 patchwork_1.2.0
[17] kableExtra_1.4.0 dplyr_1.1.4 rio_1.2.2 here_1.0.1
[21] pacman_0.5.1
loaded via a namespace (and not attached):
[1] gld_2.6.6 readxl_1.4.3 rlang_1.1.4 magrittr_2.0.3
[5] snakecase_0.11.1 e1071_1.7-14 compiler_4.4.1 systemfonts_1.1.0
[9] vctrs_0.6.5 pkgconfig_2.0.3 shape_1.4.6.1 fastmap_1.2.0
[13] backports_1.5.0 labeling_0.4.3 utf8_1.2.4 rmarkdown_2.28
[17] tzdb_0.4.0 nloptr_2.1.1 visdat_0.6.0 ragg_1.3.2
[21] xfun_0.47 glmnet_4.1-8 jomo_2.7-6 jsonlite_1.8.8
[25] highr_0.11 pan_1.9 jpeg_0.1-10 broom_1.0.6
[29] R6_2.5.1 stringi_1.8.4 rpart_4.1.23 boot_1.3-30
[33] cellranger_1.1.0 Rcpp_1.0.13 iterators_1.0.14 nnet_7.3-19
[37] Matrix_1.7-0 splines_4.4.1 timechange_0.3.0 tidyselect_1.2.1
[41] rstudioapi_0.16.0 yaml_2.3.10 codetools_0.2-20 lattice_0.22-5
[45] withr_3.0.1 evaluate_0.24.0 proxy_0.4-27 xml2_1.3.6
[49] pillar_1.9.0 mice_3.16.0 foreach_1.5.2 generics_0.1.3
[53] rprojroot_2.0.4 hms_1.1.3 munsell_0.5.1 scales_1.3.0
[57] rootSolve_1.8.2.4 minqa_1.2.8 class_7.3-22 glue_1.7.0
[61] lmom_3.0 tools_4.4.1 data.table_1.16.0 lme4_1.1-35.5
[65] Exact_3.3 mvtnorm_1.3-1 grid_4.4.1 colorspace_2.1-1
[69] nlme_3.1-165 cli_3.6.3 textshaping_0.4.0 fansi_1.0.6
[73] expm_1.0-0 viridisLite_0.4.2 svglite_2.1.3 gtable_0.3.5
[77] digest_0.6.37 htmlwidgets_1.6.4 farver_2.1.2 htmltools_0.5.8.1
[81] lifecycle_1.0.4 httr_1.4.7 mitml_0.4-5 MASS_7.3-61